"""Scrape free proxies from kuaidaili.com and test which of them actually work."""
from crawlers.userAgent import useragent  # project-local helper that returns random User-Agent strings
import requests
from lxml import etree
import random
import threading
import time
import sys

class IP(object):
    def __init__(self):
        self.url = 'https://www.kuaidaili.com/free/inha/{}/'  # kuaidaili free-proxy listing, paginated
        self.proxies_list = []  # proxies scraped from the listing pages
        self.url2 = 'https://www.baidu.com'  # Baidu is used to test whether a proxy works
        self.usefulIP = []  # proxies that passed the test

    def progressBar(self):  # cosmetic progress bar (runs for about 12.5 seconds)
        for i in range(1, 101):
            sys.stdout.write('\r')
            sys.stdout.write('{0}% |{1}'.format(i, (i // 2) * '■'))
            sys.stdout.flush()
            time.sleep(0.125)
        sys.stdout.write('\n')

    def check_ip(self, ips):
        # Worker: keep popping proxies off the shared list until it is empty.
        while True:
            try:
                # list.pop() is atomic under CPython's GIL, so workers can share `ips`
                ip = ips.pop()
            except IndexError:  # list exhausted; this worker is done
                break
            try:
                headers = {'User-Agent': useragent().getUserAgent()}
                # the very short timeout deliberately keeps only fast, responsive proxies
                rsp = requests.get(url=self.url2, headers=headers, proxies=ip, timeout=0.2)
                if rsp.status_code == 200:
                    self.usefulIP.append(ip)
                    # print('========IP {} works'.format(ip))  # uncomment when debugging
                    time.sleep(1)  # pause for one second after each hit
            except Exception as e:
                print(e)

    def get_UsefulIp(self):  # test the scraped proxies concurrently
        self.get_ips()  # scrape 5 listing pages first
        ips = self.proxies_list
        threadList = []  # worker threads
        for i in range(5):  # spawn 5 worker threads
            thread = threading.Thread(target=self.check_ip, args=(ips,))
            thread.start()
            threadList.append(thread)

        for thread in threadList:
            thread.join()

        self.progressBar()
        print('IP test completed!')
        print('(The number of available IPs is:[%d])' % len(self.usefulIP))

        return self.usefulIP

    def get_ips(self):
        page = random.randrange(1, 3000)  # pick a random starting page in the listing
        for i in range(page, page + 5):  # scrape 5 consecutive pages; adjust as needed
            print('=======>Requesting page-{}'.format(i))
            url = self.url.format(i)
            headers = {'User-Agent': useragent().getUserAgent()}
            response = requests.get(url=url, headers=headers)
            HTML = etree.HTML(response.text)
            infos = HTML.xpath("//table[@class='table table-bordered table-striped']/tbody/tr")

            for info in infos:
                proxies_dict = {}
                ip = info.xpath('./td[1]/text()')[0]       # proxy address
                ip_port = info.xpath('./td[2]/text()')[0]  # proxy port
                ip_type = info.xpath('./td[4]/text()')[0]  # protocol: HTTP or HTTPS
                # build a requests-style proxies mapping, e.g. {'http': 'http://1.2.3.4:8080'}
                proxies_dict[ip_type.lower()] = '{}://{}:{}'.format(ip_type.lower(), ip, ip_port)
                self.proxies_list.append(proxies_dict)
            time.sleep(3)  # pause between pages so requests are not fired too quickly
        print('(The total number of IP crawled is:[%d])' % len(self.proxies_list))


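# Minimal usage sketch (assumes this module and the project-local
# crawlers.userAgent helper are importable): scrape a batch of proxies,
# test them, then route one request through a randomly chosen working proxy.
if __name__ == '__main__':
    pool = IP()
    working = pool.get_UsefulIp()
    if working:
        proxy = random.choice(working)
        print('Using proxy:', proxy)
        rsp = requests.get('https://www.baidu.com',
                           headers={'User-Agent': useragent().getUserAgent()},
                           proxies=proxy, timeout=5)
        print('Status:', rsp.status_code)
    else:
        print('No working proxies found; try running again.')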